#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
suppressWarnings(suppressMessages(library(WGCNA)))
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_21_WGCNA.Rmd)
# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Clusterings
clusterings = read_csv('./../Data/Gandal/clusters.csv')
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
# Dataset created with DynamicTreeMerged algorithm
clustering_selected = 'DynamicHybrid'
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
mm = dataset %>% dplyr::select(starts_with('MM.'), starts_with('MMgray')) %>% dplyr::rename(MM.gray = MMgray)
rownames(mm) = dataset$ID
genes_info$Module = gsub('MM.', '', colnames(mm)[max.col(mm, ties.method='first')])
genes_info$Module[genes_info$Module!='gray'] = paste0('#',genes_info$Module[genes_info$Module!='gray'])
rm(DE_info, GO_annotations, clusterings, dds, GO_neuronal, dataset, mm)
*The colour of the modules is the arbitrary one assigned during the WGCNA algorithm, where the gray cluster actually represents all the genes that were left without a cluster (so it’s not actually a cluster).
Leaves more genes without a module than the original module assignment, but the module sizes are more balanced
cat(paste0('The maxMM assignment created ', length(unique(genes_info$Module))-1, ' modules and left ',
sum(genes_info$Module=='gray'), ' genes without a module.\n'))
## The maxMM assignment created 61 modules and left 322 genes without a module.
table(genes_info$Module)
##
## #00A5FF #00AAFE #00AEF9 #00B2F3 #00B6EC #00B930 #00B9E4 #00BB47 #00BBDC
## 121 203 222 362 331 128 255 131 164
## #00BC59 #00BDD3 #00BE69 #00BEC9 #00BF78 #00C085 #00C092 #00C0B5 #00C0BF
## 244 303 244 189 278 308 496 193 434
## #00C19E #00C1AA #13B700 #43B500 #48A0FF #5CB300 #6B9AFF #6FB000 #7FAE00
## 217 338 166 235 182 108 275 221 212
## #8594FF #8DAB00 #9AA800 #9B8EFF #A5A500 #AD87FF #AFA200 #B99E00 #BD81FF
## 276 496 497 243 244 267 230 228 227
## #C29B00 #CA7BFF #CA9700 #D19300 #D675FD #D88F00 #DF8B00 #E06FF8 #E58702
## 212 282 183 384 258 287 269 178 240
## #E96AF1 #EB8332 #F066EA #F07F4A #F47A5D #F663E2 #F8766D #FA62D9 #FB727C
## 440 164 514 624 204 165 138 357 347
## #FE61D0 #FE6E8A #FF61C5 #FF62BB #FF64AF #FF67A3 #FF6A97 gray
## 474 363 179 186 98 237 227 322
plot_data = table(genes_info$Module) %>% data.frame %>% arrange(desc(Freq))
ggplotly(plot_data %>% ggplot(aes(x=reorder(Var1, -Freq), y=Freq)) + geom_bar(stat='identity', fill=plot_data$Var1) +
ggtitle('Module size') + ylab('Number of genes') + xlab('Module') + theme_minimal() +
theme(axis.text.x = element_text(angle = 90)))
In the WGCNA documentation they use Pearson correlation to calculate correlations, I think all of their variables were continuous. Since I have categorical variables I’m going to use the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.
I’m not sure how the corPvalueStudent function calculates the p-values and I cannot find any documentation…
Compared correlations using Pearson correlation and with hetcor and they are very similar, but a bit more extreme with hetcor. The same thing happens with the p-values.
datTraits = datMeta %>% dplyr::select(Diagnosis_, Region, Sex, Age, PMI, RNAExtractionBatch) %>%
rename('Diagnosis' = Diagnosis_, 'ExtractionBatch' = RNAExtractionBatch)
# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)
# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))
# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)
# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated'))
}
## [1] "1 correlation(s) could not be calculated"
rm(ME_object)
Note: The correlation between Module #9B8EFF and Diagonsis is the one that cannot be calculated, weirdly enough, the thing that causes the error is that the initial correlation is too high, so it would be a very bad thing to lose this module because of this numerical error. I’m going to fill in its value using the polyserial function, which doesn’t give exactly the same results as the hetcor() function, but it’s quite similar.
The sign of this module’s eigengene is enough to classify 94% of the observations correctly! So even though the correlation that the polyserial function assign to it is a bit extreme (1), it makes sense
plot_data = data.frame('ME' = MEs[,'ME#9B8EFF'], 'Diagnosis' = datTraits$Diagnosis)
ggplotly(plot_data %>% ggplot(aes(Diagnosis, ME, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
theme(legend.position='none') + ylab('ME vals for Module #9B8EFF'))
table(sign(plot_data$ME), plot_data$Diagnosis)
##
## CTL ASD
## -1 34 4
## 1 1 41
moduleTraitCor['ME#9B8EFF','Diagnosis'] = polyserial(MEs[,'ME#9B8EFF'], datTraits$Diagnosis)
## Warning in polyserial(MEs[, "ME#9B8EFF"], datTraits$Diagnosis): initial
## correlation inadmissible, 1.03850783543363, set to 0.9999
Modules have very strong correlations with Diagnosis with really small p-values and not much relation with anything else. Perhaps a little with PMI and Brain Region.
It’s a good sign that the gray module has one of the lowest correlations with diagnosis, since we know its composed mainly of not differentially expressed genes.
The correlation between Diagonsis and the top modules is slightly lower than with the originally assigned modules (~1 percentual point lower)
The six modules with the strongest (absolute) correlation with Diagnosis are the same and they mantain the same order than with the originally assigned modules
moduleTraitPvalue = moduleTraitPvalue[complete.cases(moduleTraitCor),]
moduleTraitCor = moduleTraitCor[complete.cases(moduleTraitCor),]
# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]
# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', round(signif(moduleTraitPvalue, 1),6), ')')
dim(textMatrix) = dim(moduleTraitCor)
labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = gsub('ME','',rownames(moduleTraitCor)),
yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
main = paste('Module-Trait relationships'))
diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
'MTcor' = moduleTraitCor[,'Diagnosis'],
'MTpval' = moduleTraitPvalue[,'Diagnosis'])
genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')
rm(moduleTraitCor, moduleTraitPvalue, datTraits, textMatrix, diagnosis_cor)
Modules with a high Module-Diagnosis correlation should have a high content of differentially expressed genes
plot_data = genes_info %>% group_by(Module, MTcor) %>% summarise(p = 100*mean(significant))
plot_data %>% ggplot(aes(MTcor, p)) + geom_hline(yintercept=mean(plot_data$p), color='gray', linetype='dotted') +
geom_point(color=plot_data$Module, aes(id=Module)) + theme_minimal() +
xlab('Modules ordered by Module-Diagnosis correlation') + ylab('Percentage of differentially expressed genes')
Gene significance: is the value between the correlation between the gene and the trait we are interested in. A positive gene significance means the gene is overexpressed and a negative value means its underexpressed. (The term ‘significance’ is not very acurate because it’s not actually measuring statistical significance, it’s just a correlation, but that’s how they call it in WGCNA…)
Module Membership is the correlation of the module’s eigengene and the expression profile of a gene. The higher the Module Membership, the more similar the gene is to the genes that constitute the module. (I won’t use this measure yet)
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]
Gene significance and Log Fold Chance are two different ways to measure the same thing, so there should be a concordance between them
Log Fold Chance has some really big outliers, but both variables agree with each other quite well
plot_data = dataset %>% dplyr::select(ID, GS) %>% left_join(genes_info %>% dplyr::select(ID, gene.score), by='ID') %>%
left_join(genes_info %>% dplyr::select(ID, baseMean, log2FoldChange, significant, MTcor, Module), by='ID') %>%
left_join(data.frame(MTcor=unique(genes_info$MTcor)) %>% arrange(by=MTcor) %>%
mutate(order=1:length(unique(genes_info$MTcor))), by='MTcor')
ggplotly(plot_data %>% ggplot(aes(GS, log2FoldChange)) + geom_point(color=plot_data$Module, alpha=0.5, aes(ID=Module)) +
geom_smooth(color='gray') + theme_minimal() + xlab('Gene Significance') +
ggtitle(paste0('Correlation = ', round(cor(plot_data$log2FoldChange, plot_data$GS)[1], 4))))
In general, modules with the highest Module-Diagnosis correlation should have genes with high Gene Significance
Note: For the Module-Diagnosis plots, if you do boxplots, you lose the exact module-diagnosis correlation and you only keep the order, so I decided to compensate this downside with a second plot, where each point is plotted individually using their module’s Module-Diagnosis correlation as the x axis. I think the boxplot plot is easier to understand but the second plot contains more information, so I don’t know which one is better.
Maybe it now looks less noisy? the difference is not that much
The R^2 increased 0.06, from 0.6592 to 0.7233, it’s not much, but it’s good
plot_data = plot_data %>% arrange(order)
ggplotly(plot_data %>% ggplot(aes(order, GS, group=order)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() +
xlab('Modules ordered by Module-Diagnosis correlation') + ylab('Gene Significance'))
plot_data %>% ggplot(aes(MTcor, GS)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', alpha=0.3) +
theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('Gene Significance') +
ggtitle(paste0('R^2=',round(cor(plot_data$MTcor, plot_data$GS)^2,4)))
The same should happen with the Log Fold Change
This plot doesn’t change much with the change of module assignment criteria
The R^2 value increases again, this time from 0.2062 to 0.2235 (increase of 0.02)
ggplotly(plot_data %>% ggplot(aes(order, log2FoldChange, group=order)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
geom_boxplot(fill=unique(plot_data$Module)) +
theme_minimal() + xlab('Modules ordered by Module-Diagnosis correlation') + ylab('log2FoldChange'))
ggplotly(plot_data %>% ggplot(aes(MTcor, log2FoldChange)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', alpha=0.3) +
theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('log2FoldChange') +
ggtitle(paste0('R^2=',round(cor(plot_data$MTcor, plot_data$log2FoldChange)^2,4))))
In theory, there shouldn’t be a relation between module-diagnosis and mean expression, but in the the exploratory analysis, we saw that the overexpressed genes tended to have lower levels of expression than the overexpressed genes, and this pattern can be seen in these plots where the modules with negative Module-Diagonsis correlation have slightly higher levels of expression than the modules with positive Module-Diagnosis correlation, although this pattern is note very strong and all modules have similar levels of expression.
ggplotly(plot_data %>% ggplot(aes(order, log2(baseMean+1), group=order)) +
geom_hline(yintercept=mean(log2(plot_data$baseMean+1)), color='gray', linetype='dotted') +
geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() +
xlab('Modules ordered by Module-Diagnosis correlation') + ylab('log2(Mean Expression)'))
plot_data %>% ggplot(aes(MTcor, log2(baseMean+1))) + geom_point(alpha=0.2, color=plot_data$Module, aes(id=ID)) +
geom_hline(yintercept=mean(log2(plot_data$baseMean+1)), color='gray', linetype='dotted') +
geom_smooth(color='gray', alpha=0.3) + theme_minimal() + xlab('Module-Diagnosis correlation') +
ggtitle(paste0('R^2=',round(cor(plot_data$MTcor, log2(plot_data$baseMean+1))^2,4)))
All of the variables seem to agree with each other, Modules with a high correlation with Diagnosis tend to have genes with high values of Log Fold Change as well as high values of Gene Significance, and the gray module, which groups all the genes that weren’t assigned to any cluster tends to have a very poor performance in all of the metrics.
The correlation between Module assignment and Diagnosis is a bit weaker than before, but I don’t think the difference is very big (~1 percentual point)
There is no significant difference in the concordance between the different variables, but the R^2 value consistently increases with these new Modules (although again, the difference is not big)
Since SFARI scores genes depending on the strength of the evidence linking it to the development of autism, in theory, there should be some concordance between the metrics we have been studying above and these scores…
SFARI scores 1 to 5 have a lower median than all genes that have a neuronal-related annotation (!)
The group with the highest Gene Significance is SFARI score 6, which is supposed to be the one with the least amount of evidence suggesting a relation to autism (!)
SFARI score 1 is the group with the lowest Gene Significance, with a (slightly) lower median than the genes without any type of Neuronal annotation (!)
Neuronal annotated genes have higher Gene Significance than genes without any neuronal-related annotation (makes sense)
This plot doesn’t depend on the assigned module, so there’s no difference between the different module assignment criteria
ggplotly(plot_data %>% ggplot(aes(gene.score, abs(GS), fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ylab('abs(Gene Significance)') + xlab('SFARI Scores') + theme(legend.position='none'))
The higher the SFARI score, the lower the Module-Trait correlation (!)
SFARI scores 1 and 2 have significantly lower values of Module-Trait correlation than the rest of the groups (!)
The group with the highest Module-Diagnosis correlation is SFARI score 6, which is supposed to be the one with the least amount of evidence suggesting a relation to autism (!)
SFARI score 1 is the group with the lowest Module-Diagonsis correlation, with a median equal to the first quartile of the genes without any type of Neuronal annotation (!)
This pattern is a lot easier to recognise with this module assignment criteria (!), reducing (even more) the module-trait correlation distribution of genes with score 1. This is bad for the model…
ggplotly(plot_data %>% ggplot(aes(gene.score, abs(MTcor), fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
ylab('abs(Module-Trait Correlation)') + xlab('SFARI Scores') + theme(legend.position='none'))
Not only are SFARI genes not consistent with the other measurements, but they seem to strongly contradict them. There is a big difference between all the metrics created from gene expression analysis and these scores.
The problems by SFARI score seem to be stronger here than with the original module assignment (!)
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] WGCNA_1.68 fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [4] doParallel_1.0.15 iterators_1.0.12 foreach_1.4.7
## [7] polycor_0.7-10 expss_0.10.1 GGally_1.4.0
## [10] gridExtra_2.3 viridis_0.5.1 viridisLite_0.3.0
## [13] RColorBrewer_1.1-2 dendextend_1.13.2 plotly_4.9.1
## [16] glue_1.3.1 reshape2_1.4.3 forcats_0.4.0
## [19] stringr_1.4.0 dplyr_0.8.3 purrr_0.3.3
## [22] readr_1.3.1 tidyr_1.0.0 tibble_2.1.3
## [25] ggplot2_3.2.1 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.5
## [3] Hmisc_4.2-0 plyr_1.8.5
## [5] lazyeval_0.2.2 splines_3.6.0
## [7] crosstalk_1.0.0 BiocParallel_1.20.1
## [9] GenomeInfoDb_1.22.0 robust_0.4-18.2
## [11] digest_0.6.23 htmltools_0.4.0
## [13] GO.db_3.10.0 fansi_0.4.1
## [15] magrittr_1.5 checkmate_1.9.4
## [17] memoise_1.1.0 fit.models_0.5-14
## [19] cluster_2.0.8 annotate_1.64.0
## [21] modelr_0.1.5 matrixStats_0.55.0
## [23] colorspace_1.4-1 blob_1.2.0
## [25] rvest_0.3.5 rrcov_1.4-7
## [27] haven_2.2.0 xfun_0.8
## [29] crayon_1.3.4 RCurl_1.95-4.12
## [31] jsonlite_1.6 genefilter_1.68.0
## [33] zeallot_0.1.0 impute_1.60.0
## [35] survival_2.44-1.1 gtable_0.3.0
## [37] zlibbioc_1.32.0 XVector_0.26.0
## [39] DelayedArray_0.12.2 BiocGenerics_0.32.0
## [41] DEoptimR_1.0-8 scales_1.1.0
## [43] mvtnorm_1.0-11 DBI_1.1.0
## [45] Rcpp_1.0.3 xtable_1.8-4
## [47] htmlTable_1.13.1 foreign_0.8-71
## [49] bit_1.1-15.1 preprocessCore_1.48.0
## [51] Formula_1.2-3 stats4_3.6.0
## [53] htmlwidgets_1.5.1 httr_1.4.1
## [55] ellipsis_0.3.0 acepack_1.4.1
## [57] farver_2.0.3 pkgconfig_2.0.3
## [59] reshape_0.8.8 XML_3.98-1.20
## [61] nnet_7.3-12 dbplyr_1.4.2
## [63] locfit_1.5-9.1 later_1.0.0
## [65] tidyselect_0.2.5 labeling_0.3
## [67] rlang_0.4.2 AnnotationDbi_1.48.0
## [69] munsell_0.5.0 cellranger_1.1.0
## [71] tools_3.6.0 cli_2.0.1
## [73] generics_0.0.2 RSQLite_2.2.0
## [75] broom_0.5.3 fastmap_1.0.1
## [77] evaluate_0.14 yaml_2.2.0
## [79] knitr_1.24 bit64_0.9-7
## [81] fs_1.3.1 robustbase_0.93-5
## [83] nlme_3.1-139 mime_0.8
## [85] xml2_1.2.2 compiler_3.6.0
## [87] rstudioapi_0.10 reprex_0.3.0
## [89] geneplotter_1.64.0 pcaPP_1.9-73
## [91] stringi_1.4.5 lattice_0.20-38
## [93] Matrix_1.2-17 vctrs_0.2.1
## [95] pillar_1.4.3 lifecycle_0.1.0
## [97] data.table_1.12.8 bitops_1.0-6
## [99] httpuv_1.5.2 GenomicRanges_1.38.0
## [101] R6_2.4.1 latticeExtra_0.6-28
## [103] promises_1.1.0 IRanges_2.20.2
## [105] codetools_0.2-16 MASS_7.3-51.4
## [107] assertthat_0.2.1 SummarizedExperiment_1.16.1
## [109] DESeq2_1.26.0 withr_2.1.2
## [111] S4Vectors_0.24.2 GenomeInfoDbData_1.2.2
## [113] mgcv_1.8-28 hms_0.5.3
## [115] grid_3.6.0 rpart_4.1-15
## [117] rmarkdown_1.14 Cairo_1.5-10
## [119] shiny_1.4.0 Biobase_2.46.0
## [121] lubridate_1.7.4 base64enc_0.1-3